import sqlite3

import matplotlib.pyplot as plt  # BUG FIX: was `import matplotlib as plt` —
# the `plt` alias conventionally refers to the pyplot submodule; aliasing the
# bare package would break any later `plt.<pyplot-function>` call.
import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats

# Show every column when displaying DataFrames (wide tables below).
pd.set_option('display.max_columns', None)
# Load the joined transaction + store records from the local SQLite database.
query = '''
SELECT Date, t."Store Number", Sale, "Volume Sold", County, "Store Type"
FROM transactions AS t
LEFT JOIN stores AS s ON t."Store Number" = s."Store Number";
'''
conn = sqlite3.connect("data/liquor.db")
try:
    store_df = pd.read_sql_query(query, conn)
finally:
    # BUG FIX: the connection was leaked if read_sql_query raised;
    # always release the DB handle.
    conn.close()
store_df.head(3)
# Parse the Date column into proper datetimes.
store_df['Date'] = pd.to_datetime(store_df['Date'])
store_df.head()
# Collapse the small categories into a single 'T Other' bucket for the
# final presentation (renamed back to 'Other' before the CSV export below).
small_types = ['Other', 'Other Grocery or Convenience', 'Casino']
store_df['Store Type'] = store_df['Store Type'].replace(small_types, 'T Other')
store_df.head()
import datetime
# Yearly total sales volume: one row per year-end timestamp.
store_line_vol_y = (store_df
                    .groupby(pd.Grouper(key='Date', freq='Y'))
                    .agg({'Volume Sold': 'sum'})
                    .reset_index())
# Pull the calendar year out of the period-end timestamp.
store_line_vol_y['Year'] = store_line_vol_y['Date'].dt.year
# Express the volume in millions (the column holds liters, not dollars).
store_line_vol_y['Volume Sold'] = store_line_vol_y['Volume Sold'] / 1_000_000
# Create df group by Y and Store type sum volume sold
# (one row per year x store type, with the summed 'Volume Sold').
store_bar_p_y = store_df.groupby([pd.Grouper(key='Date', freq='Y'),'Store Type']).agg({'Volume Sold':'sum'}).reset_index()
# Create column Percent for normalized store volumes sold
# NOTE(review): transform() runs over every non-grouping column, so this
# relies on pandas dropping/ignoring the non-numeric 'Store Type' column —
# that behavior is pandas-version dependent; verify on upgrade.
# NOTE(review): the 17.5 factor is not a true percent (that would be 100);
# presumably it scales the normalized bars to overlay the volume line in the
# downstream R figure — TODO confirm.
store_bar_p_y['Percent'] = store_bar_p_y.groupby([pd.Grouper(key='Date', freq='Y')]).transform(lambda x: (x/sum(x))*17.5)
# Create year column
store_bar_p_y['Year'] = pd.DatetimeIndex(store_bar_p_y['Date']).year
# Join the per-type percentages with the yearly totals on Year.
store_r = store_bar_p_y.merge(store_line_vol_y, how='left', on='Year')
# Drop the (incomplete) 2020 data.
store_r = store_r[store_r['Year'] != 2020]
# Restore the display label for the collapsed category.
store_r['Store Type'] = store_r['Store Type'].replace(['T Other'], 'Other')
# Export for the R visualization script.
store_r.to_csv('data/store_r2.csv', index=False)
# Overall summary statistics for the joined transactions.
store_df.describe()
# Per-store-type grouping, reused by several plots below.
store_type_g = store_df.groupby(by='Store Type')
store_type_g.describe()
# Calendar-month grouping of all transactions (reused repeatedly below).
store_m_g = store_df.groupby(pd.Grouper(key='Date', freq='M'))
# Distribution of total monthly sales volume.
monthly_totals = store_m_g.sum().reset_index()
fig = px.histogram(monthly_totals, x='Volume Sold', nbins=25,
                   title='Overall Monthly sales Volume')
fig
# Grouping by (store type, month), reused throughout the analysis.
store_type_m_g = store_df.groupby(['Store Type', pd.Grouper(key='Date', freq='M')])
# Monthly sales volume distributions, one facet per store type.
type_monthly_totals = store_type_m_g.sum().reset_index()
fig = px.histogram(type_monthly_totals,
                   x='Volume Sold',
                   facet_col='Store Type',
                   facet_col_wrap=2)
fig
# Quarterly sales volume per store, as a histogram.
store_nu_q_g = store_df.groupby(['Store Number', pd.Grouper(key='Date', freq='Q')])
# BUG FIX: the grouping is quarterly (freq='Q'), but the title previously
# said 'Monthly Volume Sold per Store' — corrected to match the data.
fig = px.histogram(store_nu_q_g.sum().reset_index(),
                   x='Volume Sold', nbins=30,
                   title='Quarterly Volume Sold per Store')
fig
# Total volume sold by store type: supermarkets sell the most liquor.
store_type_g['Volume Sold'].sum().sort_values(ascending=False).plot.bar()
# Monthly-total boxplots give a richer picture of volume by store type
# than a single bar per category.
type_month_sums = store_type_m_g.sum().sort_values(by='Volume Sold', ascending=False)
fig = px.box(type_month_sums.reset_index(),
             x='Store Type',
             y='Volume Sold',
             title='Total Monthly Sales Volume by Store Type')
fig
# Average monthly volume per store within each store type:
# the inner groupby sums each store's transactions per (type, month, store),
# the outer groupby averages those per-store sums across stores of the
# same type for each month.
store_m_ty_no_g_ave=store_df.groupby(['Store Type', pd.Grouper(key='Date', freq='M'), 'Store Number']).sum().reset_index().groupby(['Store Type', pd.Grouper(key='Date', freq='M')]).mean().reset_index()
# Manual display order for the box plot's x-axis.
# NOTE(review): 'Casino', 'Other Grocery or Convenience', and 'Other' were
# collapsed into 'T Other' earlier, and 'T Other' has no entry here, so those
# rows map to Order = NaN and sort last — confirm this is intended.
store_ave_map = {'Supermarket':1, 'Casino':6, 'Other':4, 'Drug Store':3, 'Liquor Tobacco Store':2, 'Other Grocery or Convenience':5, 'Convenience Store':7}
store_m_ty_no_g_ave['Order'] = store_m_ty_no_g_ave['Store Type'].map(store_ave_map)
fig = px.box(store_m_ty_no_g_ave.sort_values(by='Order'),
x = 'Store Type',
y ='Volume Sold',
title='Average Monthly Store Sales Volume')
# Center the title and label the y-axis with units.
fig.update_layout(yaxis_title="Volume Sold (Liters)",
title={
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
store_df.head()
# Mean monthly volume per (store type, month), from the grouped object
# defined earlier; this is the sample data for the t-tests below.
tt_store_mean= store_type_m_g.mean().reset_index()
# NOTE(review): this map still lists the pre-collapse categories ('Casino',
# 'Other', 'Other Grocery or Convenience'); after the earlier replace()
# those rows appear as 'T Other' and map to NaN — confirm intended, and
# confirm the t-tests below were run before the collapse cell.
store_ave_map = {'Supermarket':1, 'Casino':2, 'Other':3, 'Drug Store':4, 'Liquor Tobacco Store':5, 'Other Grocery or Convenience':6, 'Convenience Store':7}
tt_store_mean["Order"] = tt_store_mean["Store Type"].map(store_ave_map)
# Supermarket, Casino, Drug Store, Liquor, and Convenience Store
# are significantly different from each other
px.box(tt_store_mean.sort_values(by='Order').reset_index(),
x = 'Store Type',
y ='Volume Sold',
title='Average Monthly Store Sales Volume by Store Type')
from scipy import stats

def _type_vol(store_type):
    # Mean monthly 'Volume Sold' series for one store type.
    return tt_store_mean.loc[tt_store_mean['Store Type'] == store_type]['Volume Sold']

# Casino and Supermarket are significantly different
# pvalue=0.02681811922044221
stats.ttest_ind(_type_vol('Casino'), _type_vol('Supermarket'))
# Casino and Other are not significantly different
# pvalue=0.50463932688488054
stats.ttest_ind(_type_vol('Casino'), _type_vol('Other'))
# Casino and Drug Store are significantly different
# pvalue=7.41395120593647e-17
stats.ttest_ind(_type_vol('Casino'), _type_vol('Drug Store'))
# Liquor and Drug Store are significantly different
# pvalue=3.888875737051862e-15
stats.ttest_ind(_type_vol('Drug Store'), _type_vol('Liquor Tobacco Store'))
_type_vol('Casino')
tt_store_mean
# Yearly trend of volume sold per store type.
# BUG FIX: the grouping is yearly (freq='Y'), but the title previously said
# 'Volume Sold by Month and Store Type' — corrected.
px.line(store_df.groupby(['Store Type', pd.Grouper(key='Date', freq='Y')])[['Volume Sold']].sum().reset_index(),
        x='Date', y='Volume Sold', color='Store Type',
        title='Volume Sold by Year and Store Type')
# Quarterly version, with centered title and unit-labeled y-axis.
fig = px.line(store_df.groupby(['Store Type', pd.Grouper(key='Date', freq='Q')])[['Volume Sold']].sum().reset_index(),
              x='Date', y='Volume Sold', color='Store Type',
              title='Total Quarterly Sales Volume')
fig.update_layout(yaxis_title="Volume Sold (Liters)",
                  title={'y': 0.9, 'x': 0.5,
                         'xanchor': 'center', 'yanchor': 'top'})
# Reuse the (store type, month) grouping to build one line per type.
store_vol = store_type_m_g[['Volume Sold']].sum().reset_index()
# Wide format: one column per store type, indexed by month.
store_vol = store_vol.pivot(index='Date',
                            columns='Store Type',
                            values='Volume Sold').reset_index()
# Takeaway: seasonality shows up in all categories except Casino (too small).
# Largest fluctuations come from grocery stores, then liquor, then drug
# stores; convenience-store sales might be on the rise.
fig = px.line(store_vol, x='Date', y=store_vol.columns,
              title='Volume Sold by Month and Store Type')
fig
# Total volume sold per month: overall upward trend.
monthly_volume = store_m_g.sum().reset_index()
fig = px.line(monthly_volume,
              x='Date',
              y='Volume Sold',
              title='Total Volume Sold by Month')
fig
New stores pop up and some of them shut down after a few years. Analyze the store opening dynamics and report findings on the store life cycle: survival analysis, survival curves, survival probabilities, etc. Refine the findings based on the store categories you design. Is it easy for stores to experience sales volume growth? With new stores popping up as competition, how are the sales of existing stores affected? Does the pie (gross sales volume in Iowa) grow bigger? Do the stores share the bigger pie? Does the winner take all, or do more players share smaller slices?
# Longevity per store: days between first and last recorded transaction.
store_lon = (store_df
             .groupby(['Store Number'])
             .agg({'Date': (lambda d: (d.max() - d.min()).days)})
             .sort_values(by='Date', ascending=False)
             .reset_index())
store_lon.describe()
fig = px.histogram(store_lon['Date'], nbins=30)
fig.update_layout(
    title='Store Longevity',
    xaxis_title="Number of Days Open",
    legend_title="",
)
fig
# Same longevity metric, broken out by store type.
store_t_lon = (store_df
               .groupby(['Store Type', 'Store Number'])
               .agg({'Date': (lambda d: (d.max() - d.min()).days)})
               .sort_values(by='Date', ascending=False)
               .reset_index())
store_t_lon.head(3)
fig = px.box(store_t_lon,
             x='Store Type',
             y='Date',
             title='Longevity by Store Type')
fig
# Caveat: doesn't account for stores that opened more recently
# (right-censoring of the longevity measure).
store_df.head()
# Count distinct active stores per month as a proxy for openings.
# TODO: overlay closures (no purchase for 6 months) and consider the
# closure/open ratio.
monthly_store_counts = store_m_g.agg({'Store Number': pd.Series.nunique}).reset_index()
fig = px.line(monthly_store_counts, x='Date', y='Store Number', title='Store Growth')
fig
# Distinct store counts per month, one line per store type.
store_type_nu = (store_type_m_g
                 .agg({'Store Number': pd.Series.nunique})
                 .reset_index()
                 .pivot(index='Date', columns='Store Type', values='Store Number')
                 .reset_index())
fig = px.line(store_type_nu, x='Date', y=store_type_nu.columns,
              title='Number of Stores by Store Type')
fig
# Number of convenience stores is growing rapidly.
# BUG FIX: store_type_q_g was used here but only defined further down the
# file (NameError when the script runs top-to-bottom); define the
# (store type, quarter) grouping before first use.
store_type_q_g = store_df.groupby(['Store Type', pd.Grouper(key='Date', freq='Q')])
fig = px.line(store_type_q_g.agg({'Store Number': pd.Series.nunique}).reset_index(),
              x='Date', y='Store Number', color='Store Type',
              title='Store Number Growth')
fig.update_layout(yaxis_title="Number of Stores",
                  title={'y': 0.9, 'x': 0.5,
                         'xanchor': 'center', 'yanchor': 'top'})
store_type_q_g
store_type_q_g.sum()
# Average monthly volume per store within each store type
# (total volume divided by the number of distinct active stores).
store_type_m_g_vol = store_type_m_g.agg({'Volume Sold': 'sum',
                                         'Store Number': pd.Series.nunique}).reset_index()
store_type_m_g_vol['AveStoreSale'] = (store_type_m_g_vol['Volume Sold']
                                      / store_type_m_g_vol['Store Number'])
px.line(store_type_m_g_vol, x='Date', y='AveStoreSale', color='Store Type',
        title='Average Volume Sold per Store by Store Type')
# Quarterly version of the per-store average volume.
store_type_q_g = store_df.groupby(['Store Type', pd.Grouper(key='Date', freq='Q')])
store_type_q_g_vol = store_type_q_g.agg({'Volume Sold': 'sum',
                                         'Store Number': pd.Series.nunique}).reset_index()
store_type_q_g_vol['AveStoreSale'] = (store_type_q_g_vol['Volume Sold']
                                      / store_type_q_g_vol['Store Number'])
fig = px.line(store_type_q_g_vol, x='Date', y='AveStoreSale', color='Store Type',
              title='Per Store Average Quarterly Sales Volume')
fig.update_layout(yaxis_title="Volume Sold (Liters)",
                  title={'y': 0.9, 'x': 0.5,
                         'xanchor': 'center', 'yanchor': 'top'})
TODO: when the Xiaogang data is available, explore the price differences between the two datasets for vodka.
TODO: take the top 20 products and build a stacked bar chart of product types?
# Peek at one joined transaction/product row to inspect the columns.
query = '''
SELECT *
FROM transactions AS t
LEFT JOIN products AS p ON t."Item Number" = p."Item Number"
LIMIT(1)
'''
conn = sqlite3.connect("data/liquor.db")
try:
    df = pd.read_sql_query(query, conn)
finally:
    # BUG FIX: the connection was leaked if read_sql_query raised;
    # always release the DB handle.
    conn.close()
df.head(3)
# Load transactions joined with product metadata.
query = '''
SELECT Date, p.product_name, category_new, "Volume Sold", proof, Sale
FROM transactions AS t
LEFT JOIN products AS p ON t."Item Number" = p."Item Number"
'''
conn = sqlite3.connect("data/liquor.db")
try:
    df = pd.read_sql_query(query, conn)
finally:
    # BUG FIX: the connection was leaked if read_sql_query raised;
    # always release the DB handle.
    conn.close()
df.head(3)
# Parse dates, then build monthly totals per vodka product.
df['Date'] = pd.to_datetime(df['Date'])
prod = (df[df.category_new == 'vodka']
        .groupby([pd.Grouper(key='Date', freq='M'), 'product_name'])
        .agg({'Volume Sold': 'sum', 'Sale': 'sum', 'proof': 'median'})
        .reset_index())
# Realized price per liter (Sale / Volume; inf where the month's volume is 0).
prod['PriceLiter'] = prod['Sale'] / prod['Volume Sold']
prod.head()
prod.loc[prod['product_name'] == 'titos handmade vodka']
px.line(prod.loc[prod['product_name'] == 'hawkeye vodka'], x='Date', y='PriceLiter')
px.line(prod.loc[prod['product_name'] == 'titos handmade vodka'], x='Date', y='PriceLiter')
prod.loc[prod['product_name'] == 'hawkeye vodka']
# Top 20 vodkas by total volume sold.
topvod = (prod.groupby('product_name')
          .agg({'Volume Sold': 'sum'})
          .sort_values(by='Volume Sold', ascending=False)
          .reset_index()
          .head(20))
topvodlist = topvod.product_name.tolist()
px.line(prod.loc[prod['product_name'].isin(topvodlist)],
        x='Date', y='PriceLiter', color='product_name')
prod.loc[prod['product_name'].isin(topvodlist)].describe()
# Zero-volume months give inf PriceLiter; inspect stats without them.
prod.loc[prod['Volume Sold'] != 0].describe()
from scipy import stats
# Compare the top-20 vodkas against the REST of the vodkas.
# BUG FIX: the first sample previously included the top-20 rows as well
# (all vodka vs. top 20), so the two samples overlapped and the t-test's
# independence assumption was violated; the "rest" sample now excludes the
# top-20 products. The top-20 price sample also kept zero-volume rows,
# whose PriceLiter is inf (Sale/0) — both samples now drop them.
_is_top = prod['product_name'].isin(topvodlist)
_has_vol = prod['Volume Sold'] != 0
# Top 20 PriceLiter vs. the rest of vodka.
stats.ttest_ind(prod.loc[_has_vol & ~_is_top]['PriceLiter'],
                prod.loc[_has_vol & _is_top]['PriceLiter'])
# Top 20 proof vs. the rest of vodka (NaN proofs dropped from both).
_has_proof = ~np.isnan(prod.proof)
stats.ttest_ind(prod.loc[_has_proof & ~_is_top]['proof'],
                prod.loc[_has_proof & _is_top]['proof'])